. To
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
```
-```{note}
+:::{note}
By default vLLM will build for all GPU types for the widest distribution. If you are only building for the
GPU type of the machine you are building on, you can add the argument `--build-arg torch_cuda_arch_list=""`
so that vLLM detects the current GPU type and builds for it.
If you are using Podman instead of Docker, you might need to disable SELinux labeling by
adding `--security-opt label=disable` when running the `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
-```
+:::
## Building for Arm64/aarch64
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use
of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
-```{note}
+:::{note}
Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
flags to speed up the build. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefit.
Keep an eye on memory usage with parallel jobs, as it can be substantial (see the example below).
-```
+:::
```console
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
@@ -85,6 +85,6 @@ $ docker run --runtime nvidia --gpus all \
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
-```{note}
+:::{note}
**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user, since a library under the root user's home directory (`/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`) is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all of the parent directories) to allow the user to access it, then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
-```
+:::
diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
index 5787c4a407bfb..b20c95137b6e7 100644
--- a/docs/source/deployment/frameworks/cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -2,11 +2,11 @@
# Cerebrium
-```{raw} html
+:::{raw} html
-```
+:::
vLLM can be run on a cloud-based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI-based applications.
diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md
index b42a34125c6d7..a16e28f2d8983 100644
--- a/docs/source/deployment/frameworks/dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -2,11 +2,11 @@
# dstack
-```{raw} html
+:::{raw} html
-```
+:::
vLLM can be run on a cloud-based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, a gateway, and GPU quotas on your cloud environment.
@@ -97,6 +97,6 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```
-```{note}
+:::{note}
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on serving vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
-```
+:::
diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md
index 18ed293191468..e4fc5e1313079 100644
--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@@ -38,213 +38,213 @@ chart **including persistent volumes** and deletes the release.
## Architecture
-```{image} /assets/deployment/architecture_helm_deployment.png
-```
+:::{image} /assets/deployment/architecture_helm_deployment.png
+:::
## Values
-```{list-table}
+:::{list-table}
:widths: 25 25 25 25
:header-rows: 1
-* - Key
- - Type
- - Default
- - Description
-* - autoscaling
- - object
- - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
- - Autoscaling configuration
-* - autoscaling.enabled
- - bool
- - false
- - Enable autoscaling
-* - autoscaling.maxReplicas
- - int
- - 100
- - Maximum replicas
-* - autoscaling.minReplicas
- - int
- - 1
- - Minimum replicas
-* - autoscaling.targetCPUUtilizationPercentage
- - int
- - 80
- - Target CPU utilization for autoscaling
-* - configs
- - object
- - {}
- - Configmap
-* - containerPort
- - int
- - 8000
- - Container port
-* - customObjects
- - list
- - []
- - Custom Objects configuration
-* - deploymentStrategy
- - object
- - {}
- - Deployment strategy configuration
-* - externalConfigs
- - list
- - []
- - External configuration
-* - extraContainers
- - list
- - []
- - Additional containers configuration
-* - extraInit
- - object
- - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
- - Additional configuration for the init container
-* - extraInit.pvcStorage
- - string
- - "50Gi"
- - Storage size of the s3
-* - extraInit.s3modelpath
- - string
- - "relative_s3_model_path/opt-125m"
- - Path of the model on the s3 which hosts model weights and config files
-* - extraInit.awsEc2MetadataDisabled
- - boolean
- - true
- - Disables the use of the Amazon EC2 instance metadata service
-* - extraPorts
- - list
- - []
- - Additional ports configuration
-* - gpuModels
- - list
- - ["TYPE_GPU_USED"]
- - Type of gpu used
-* - image
- - object
- - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
- - Image configuration
-* - image.command
- - list
- - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
- - Container launch command
-* - image.repository
- - string
- - "vllm/vllm-openai"
- - Image repository
-* - image.tag
- - string
- - "latest"
- - Image tag
-* - livenessProbe
- - object
- - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
- - Liveness probe configuration
-* - livenessProbe.failureThreshold
- - int
- - 3
- - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
-* - livenessProbe.httpGet
- - object
- - {"path":"/health","port":8000}
- - Configuration of the Kubelet http request on the server
-* - livenessProbe.httpGet.path
- - string
- - "/health"
- - Path to access on the HTTP server
-* - livenessProbe.httpGet.port
- - int
- - 8000
- - Name or number of the port to access on the container, on which the server is listening
-* - livenessProbe.initialDelaySeconds
- - int
- - 15
- - Number of seconds after the container has started before liveness probe is initiated
-* - livenessProbe.periodSeconds
- - int
- - 10
- - How often (in seconds) to perform the liveness probe
-* - maxUnavailablePodDisruptionBudget
- - string
- - ""
- - Disruption Budget Configuration
-* - readinessProbe
- - object
- - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
- - Readiness probe configuration
-* - readinessProbe.failureThreshold
- - int
- - 3
- - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
-* - readinessProbe.httpGet
- - object
- - {"path":"/health","port":8000}
- - Configuration of the Kubelet http request on the server
-* - readinessProbe.httpGet.path
- - string
- - "/health"
- - Path to access on the HTTP server
-* - readinessProbe.httpGet.port
- - int
- - 8000
- - Name or number of the port to access on the container, on which the server is listening
-* - readinessProbe.initialDelaySeconds
- - int
- - 5
- - Number of seconds after the container has started before readiness probe is initiated
-* - readinessProbe.periodSeconds
- - int
- - 5
- - How often (in seconds) to perform the readiness probe
-* - replicaCount
- - int
- - 1
- - Number of replicas
-* - resources
- - object
- - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
- - Resource configuration
-* - resources.limits."nvidia.com/gpu"
- - int
- - 1
- - Number of gpus used
-* - resources.limits.cpu
- - int
- - 4
- - Number of CPUs
-* - resources.limits.memory
- - string
- - "16Gi"
- - CPU memory configuration
-* - resources.requests."nvidia.com/gpu"
- - int
- - 1
- - Number of gpus used
-* - resources.requests.cpu
- - int
- - 4
- - Number of CPUs
-* - resources.requests.memory
- - string
- - "16Gi"
- - CPU memory configuration
-* - secrets
- - object
- - {}
- - Secrets configuration
-* - serviceName
- - string
- -
- - Service name
-* - servicePort
- - int
- - 80
- - Service port
-* - labels.environment
- - string
- - test
- - Environment name
-* - labels.release
- - string
- - test
- - Release name
-```
+- * Key
+ * Type
+ * Default
+ * Description
+- * autoscaling
+ * object
+ * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
+ * Autoscaling configuration
+- * autoscaling.enabled
+ * bool
+ * false
+ * Enable autoscaling
+- * autoscaling.maxReplicas
+ * int
+ * 100
+ * Maximum replicas
+- * autoscaling.minReplicas
+ * int
+ * 1
+ * Minimum replicas
+- * autoscaling.targetCPUUtilizationPercentage
+ * int
+ * 80
+ * Target CPU utilization for autoscaling
+- * configs
+ * object
+ * {}
+ * Configmap
+- * containerPort
+ * int
+ * 8000
+ * Container port
+- * customObjects
+ * list
+ * []
+ * Custom Objects configuration
+- * deploymentStrategy
+ * object
+ * {}
+ * Deployment strategy configuration
+- * externalConfigs
+ * list
+ * []
+ * External configuration
+- * extraContainers
+ * list
+ * []
+ * Additional containers configuration
+- * extraInit
+ * object
+ * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
+ * Additional configuration for the init container
+- * extraInit.pvcStorage
+ * string
+ * "50Gi"
+ * Storage size of the s3
+- * extraInit.s3modelpath
+ * string
+ * "relative_s3_model_path/opt-125m"
+ * Path of the model on the s3 which hosts model weights and config files
+- * extraInit.awsEc2MetadataDisabled
+ * boolean
+ * true
+ * Disables the use of the Amazon EC2 instance metadata service
+- * extraPorts
+ * list
+ * []
+ * Additional ports configuration
+- * gpuModels
+ * list
+ * ["TYPE_GPU_USED"]
+ * Type of gpu used
+- * image
+ * object
+ * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
+ * Image configuration
+- * image.command
+ * list
+ * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
+ * Container launch command
+- * image.repository
+ * string
+ * "vllm/vllm-openai"
+ * Image repository
+- * image.tag
+ * string
+ * "latest"
+ * Image tag
+- * livenessProbe
+ * object
+ * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
+ * Liveness probe configuration
+- * livenessProbe.failureThreshold
+ * int
+ * 3
+ * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
+- * livenessProbe.httpGet
+ * object
+ * {"path":"/health","port":8000}
+ * Configuration of the Kubelet http request on the server
+- * livenessProbe.httpGet.path
+ * string
+ * "/health"
+ * Path to access on the HTTP server
+- * livenessProbe.httpGet.port
+ * int
+ * 8000
+ * Name or number of the port to access on the container, on which the server is listening
+- * livenessProbe.initialDelaySeconds
+ * int
+ * 15
+ * Number of seconds after the container has started before liveness probe is initiated
+- * livenessProbe.periodSeconds
+ * int
+ * 10
+ * How often (in seconds) to perform the liveness probe
+- * maxUnavailablePodDisruptionBudget
+ * string
+ * ""
+ * Disruption Budget Configuration
+- * readinessProbe
+ * object
+ * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
+ * Readiness probe configuration
+- * readinessProbe.failureThreshold
+ * int
+ * 3
+ * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
+- * readinessProbe.httpGet
+ * object
+ * {"path":"/health","port":8000}
+ * Configuration of the Kubelet http request on the server
+- * readinessProbe.httpGet.path
+ * string
+ * "/health"
+ * Path to access on the HTTP server
+- * readinessProbe.httpGet.port
+ * int
+ * 8000
+ * Name or number of the port to access on the container, on which the server is listening
+- * readinessProbe.initialDelaySeconds
+ * int
+ * 5
+ * Number of seconds after the container has started before readiness probe is initiated
+- * readinessProbe.periodSeconds
+ * int
+ * 5
+ * How often (in seconds) to perform the readiness probe
+- * replicaCount
+ * int
+ * 1
+ * Number of replicas
+- * resources
+ * object
+ * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
+ * Resource configuration
+- * resources.limits."nvidia.com/gpu"
+ * int
+ * 1
+ * Number of gpus used
+- * resources.limits.cpu
+ * int
+ * 4
+ * Number of CPUs
+- * resources.limits.memory
+ * string
+ * "16Gi"
+ * CPU memory configuration
+- * resources.requests."nvidia.com/gpu"
+ * int
+ * 1
+ * Number of gpus used
+- * resources.requests.cpu
+ * int
+ * 4
+ * Number of CPUs
+- * resources.requests.memory
+ * string
+ * "16Gi"
+ * CPU memory configuration
+- * secrets
+ * object
+ * {}
+ * Secrets configuration
+- * serviceName
+ * string
+ *
+ * Service name
+- * servicePort
+ * int
+ * 80
+ * Service port
+- * labels.environment
+ * string
+ * test
+ * Environment name
+- * labels.release
+ * string
+ * test
+ * Release name
+:::
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
index 964782763f6b3..cb758d3e6d2e4 100644
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
@@ -1,6 +1,6 @@
# Using other frameworks
-```{toctree}
+:::{toctree}
:maxdepth: 1
bentoml
@@ -11,4 +11,4 @@ lws
modal
skypilot
triton
-```
+:::
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index 051fc2f2a8d4e..5e101b9001033 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -2,11 +2,11 @@
# SkyPilot
-```{raw} html
+:::{raw} html
-```
+:::
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
@@ -104,10 +104,10 @@ service:
max_completion_tokens: 1
```
-```{raw} html
+:::{raw} html
Click to see the full recipe YAML
-```
+:::
```yaml
service:
@@ -153,9 +153,9 @@ run: |
2>&1 | tee api_server.log
```
-```{raw} html
+:::{raw} html
-```
+:::
Start serving the Llama-3 8B model on multiple replicas:
@@ -169,10 +169,10 @@ Wait until the service is ready:
watch -n10 sky serve status vllm
```
-```{raw} html
+:::{raw} html
Example outputs:
-```
+:::
```console
Services
@@ -185,9 +185,9 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
```
-```{raw} html
+:::{raw} html
-```
+:::
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
@@ -223,10 +223,10 @@ service:
This will scale the service up when the QPS exceeds 2 for each replica.
-```{raw} html
+:::{raw} html
Click to see the full recipe YAML
-```
+:::
```yaml
service:
@@ -275,9 +275,9 @@ run: |
2>&1 | tee api_server.log
```
-```{raw} html
+:::{raw} html
-```
+:::
To update the service with the new config:
@@ -295,10 +295,10 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
-```{raw} html
+:::{raw} html
Click to see the full GUI YAML
-```
+:::
```yaml
envs:
@@ -328,9 +328,9 @@ run: |
--stop-token-ids 128009,128001 | tee ~/gradio.log
```
-```{raw} html
+:::{raw} html
-```
+:::
1. Start the chat web UI:
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md
index d47ede8967547..c286edb4d7bc1 100644
--- a/docs/source/deployment/integrations/index.md
+++ b/docs/source/deployment/integrations/index.md
@@ -1,9 +1,9 @@
# External Integrations
-```{toctree}
+:::{toctree}
:maxdepth: 1
kserve
kubeai
llamastack
-```
+:::
diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md
index a58f791c2997b..87feb48856853 100644
--- a/docs/source/deployment/nginx.md
+++ b/docs/source/deployment/nginx.md
@@ -105,9 +105,9 @@ docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-si
docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
```
-```{note}
+:::{note}
If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
-```
+:::
(nginxloadbalancer-nginx-launch-nginx)=
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index cec503ef2f77d..04886e5981eef 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -4,19 +4,19 @@
This document provides an overview of the vLLM architecture.
-```{contents} Table of Contents
+:::{contents} Table of Contents
:depth: 2
:local: true
-```
+:::
## Entrypoints
vLLM provides a number of entrypoints for interacting with the system. The
following diagram shows the relationship between them.
-```{image} /assets/design/arch_overview/entrypoints.excalidraw.png
+:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png
:alt: Entrypoints Diagram
-```
+:::
### LLM Class
@@ -84,9 +84,9 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
the vLLM system, handling model inference and asynchronous request processing.
-```{image} /assets/design/arch_overview/llm_engine.excalidraw.png
+:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png
:alt: LLMEngine Diagram
-```
+:::
### LLMEngine
@@ -144,11 +144,11 @@ configurations affect the class we ultimately get.
The following figure shows the class hierarchy of vLLM:
-> ```{figure} /assets/design/hierarchy.png
+> :::{figure} /assets/design/hierarchy.png
> :align: center
> :alt: query
> :width: 100%
-> ```
+> :::
There are several important design choices behind this class hierarchy:
@@ -178,7 +178,7 @@ of a vision model and a language model. By making the constructor uniform, we
can easily create a vision model and a language model and compose them into a
vision-language model.
-````{note}
+:::{note}
To support this change, all vLLM models' signatures have been updated to:
```python
@@ -215,7 +215,7 @@ else:
```
This way, the model can work with both old and new versions of vLLM.
-````
+:::
3\. **Sharding and Quantization at Initialization**: Certain features require
changing the model weights. For example, tensor parallelism needs to shard the
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
index f896f903c78f5..5f2582877260a 100644
--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
@@ -139,26 +139,26 @@
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
```
- ```{figure} ../../assets/kernel/query.png
+ :::{figure} ../../assets/kernel/query.png
:align: center
:alt: query
:width: 70%
Query data of one token at one head
- ```
+ :::
- Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
total of 128 elements divided into 128 / 4 = 32 vecs.
- ```{figure} ../../assets/kernel/q_vecs.png
+ :::{figure} ../../assets/kernel/q_vecs.png
:align: center
:alt: q_vecs
:width: 70%
`q_vecs` for one thread group
- ```
+ :::
```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@@ -195,13 +195,13 @@
points to key token data based on `k_cache` at assigned block,
assigned head and assigned token.
- ```{figure} ../../assets/kernel/key.png
+ :::{figure} ../../assets/kernel/key.png
:align: center
:alt: key
:width: 70%
Key data of all context tokens at one head
- ```
+ :::
- The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@@ -214,13 +214,13 @@
elements for one token) that will be processed by 2 threads (one
thread group) separately.
- ```{figure} ../../assets/kernel/k_vecs.png
+ :::{figure} ../../assets/kernel/k_vecs.png
:align: center
:alt: k_vecs
:width: 70%
`k_vecs` for one thread
- ```
+ :::
```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD]
@@ -289,14 +289,14 @@
should be performed across the entire thread block, encompassing
results between the query token and all context key tokens.
- ```{math}
+ :::{math}
:nowrap: true
\begin{gather*}
m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
\end{gather*}
- ```
+ :::
### `qk_max` and `logits`
@@ -379,29 +379,29 @@
## Value
-```{figure} ../../assets/kernel/value.png
+:::{figure} ../../assets/kernel/value.png
:align: center
:alt: value
:width: 70%
Value data of all context tokens at one head
-```
+:::
-```{figure} ../../assets/kernel/logits_vec.png
+:::{figure} ../../assets/kernel/logits_vec.png
:align: center
:alt: logits_vec
:width: 50%
`logits_vec` for one thread
-```
+:::
-```{figure} ../../assets/kernel/v_vec.png
+:::{figure} ../../assets/kernel/v_vec.png
:align: center
:alt: v_vec
:width: 70%
List of `v_vec` for one thread
-```
+:::
- Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group
diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index c2cdb75ea08a7..55dae0bb92d4e 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -7,9 +7,9 @@ page for information on known issues and how to solve them.
## Introduction
-```{important}
+:::{important}
The source code references are to the state of the code at the time of writing, in December 2024.
-```
+:::
The use of Python multiprocessing in vLLM is complicated by:
diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md
new file mode 100644
index 0000000000000..dc8432baef9d9
--- /dev/null
+++ b/docs/source/design/v1/prefix_caching.md
@@ -0,0 +1,228 @@
+# Automatic Prefix Caching
+
+Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computation. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic) and most open-source LLM inference frameworks (e.g., SGLang).
+
+While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block:
+
+```text
+ Block 1 Block 2 Block 3
+ [A gentle breeze stirred] [the leaves as children] [laughed in the distance]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
+```
+
+In the example above, the KV cache in the first block can be uniquely identified by the tokens “A gentle breeze stirred”. The third block can be uniquely identified by the tokens in the block, “laughed in the distance”, together with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash as `hash(tuple[components])`, where the components are:
+
+* Parent hash value: The hash value of the parent block.
+* Block tokens: A tuple of the tokens in this block. The exact tokens are included to reduce potential hash collisions.
+* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below).
+
+Note 1: We only cache full blocks.
+
+Note 2: The above hash key structure is not 100% collision free. Theoretically it’s still possible for different prefix tokens to have the same hash value, but this should be nearly impossible in practice. Of course, contributions are welcome if you have an awesome idea to eliminate collisions entirely.
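This construction can be sketched in a few lines. The helper below is an illustration only, not vLLM's actual implementation; the real hashing differs in details:

```python
from typing import Optional, Tuple

def block_hash(parent_hash: Optional[int],
               block_tokens: Tuple[int, ...],
               extra_hashes: Tuple = ()) -> int:
    # Chaining the parent hash makes each block's hash depend on every
    # prefix token; including the raw tokens reduces collision risk;
    # extra hashes (LoRA IDs, image hashes) keep otherwise-identical
    # token blocks apart.
    return hash((parent_hash, block_tokens, extra_hashes))

# Three full blocks: each hash transitively covers all prefix tokens.
h1 = block_hash(None, (101, 102, 103, 104))
h2 = block_hash(h1, (105, 106, 107, 108))
h3 = block_hash(h2, (109, 110, 111, 112))
```

Because the parent hash is part of the key, identical token blocks at different prompt positions hash differently, which is exactly what prefix matching needs.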
+
+**A hashing example with multi-modality inputs**
+In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages:
+
+```text
+messages = [
+ {"role": "user",
+ "content": [
+ {"type": "text",
+ "text": "What's in this image?"
+ },
+ {"type": "image_url",
+ "image_url": {"url": image_url},
+ },
+ ]},
+]
+```
+
+It will become the following prompt:
+
+```text
+Prompt:
+ [INST]What's in this image?\n[IMG][/INST]
+
+Tokenized prompt:
+ [1, 3, 7493, 1681, 1294, 1593, 3937, 9551, 10, 4]
+
+Prompt with placeholders (<P>):
+    [1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <P>, <P>, ..., <P>, 4]
+```
+
+As we can see, after tokenization, the `[IMG]` is replaced by a sequence of placeholder tokens, and these placeholders are replaced by image embeddings during prefill. The challenge for prefix caching to support this case is that we need to differentiate images from the placeholders. To address this problem, we encode the image hash generated by the frontend image processor. For example, the hashes of the blocks in the above prompt would be (assuming block size 16, and 41 placeholder tokens):
+
+```text
+Block 0
+  Parent hash: None
+  Token IDs: 1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <P>, ..., <P>
+  Extra hash: <image hash>
+Block 1
+  Parent hash: Block 0 hash
+  Token IDs: <P>, ..., <P>
+  Extra hash: <image hash>
+Block 2
+  Parent hash: Block 1 hash
+  Token IDs: <P>, ..., <P>
+  Extra hash: <image hash>
+Block 3
+  Parent hash: Block 2 hash
+  Token IDs: <P>, ..., <P>, 4
+  Extra hash: <image hash>
+```
+
+In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of the major KV cache operations (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end-to-end prefix caching workflow.
+
+## Data Structure
+
+The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
+
+```python
+class KVCacheBlock:
+ # The block ID (immutable)
+ block_id: int
+ # The block hash (will be assigned when the block is full,
+ # and will be reset when the block is evicted).
+ block_hash: BlockHashType
+ # The number of requests using this block now.
+ ref_cnt: int
+
+ # The pointers to form a doubly linked list for the free queue.
+ prev_free_block: Optional["KVCacheBlock"] = None
+ next_free_block: Optional["KVCacheBlock"] = None
+```
+
+There are two design points to highlight:
+
+1. We allocate all KVCacheBlocks as a block pool when initializing the KV cache manager. This avoids Python object creation overheads and makes it easy to track all blocks at all times.
+2. We introduce the doubly linked list pointers directly in the KVCacheBlock, so that we can construct a free queue directly. This gives us two benefits:
+   1. Moving elements in the middle of the queue to the tail takes O(1) time.
+   2. We avoid introducing another Python queue (e.g., `deque`), which would wrap the elements.
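A minimal sketch of such a free queue, with hypothetical names and simplified from the design above:

```python
from typing import List, Optional

class Block:
    def __init__(self, block_id: int):
        self.block_id = block_id
        self.prev_free_block: Optional["Block"] = None
        self.next_free_block: Optional["Block"] = None

class FreeBlockQueue:
    """Doubly linked free list: O(1) unlink from the middle, O(1) append."""

    def __init__(self, blocks: List[Block]):
        self.head: Optional[Block] = blocks[0] if blocks else None
        self.tail: Optional[Block] = blocks[-1] if blocks else None
        for a, b in zip(blocks, blocks[1:]):
            a.next_free_block, b.prev_free_block = b, a

    def remove(self, block: Block) -> None:
        # Unlink a block anywhere in the list without scanning it.
        if block.prev_free_block:
            block.prev_free_block.next_free_block = block.next_free_block
        else:
            self.head = block.next_free_block
        if block.next_free_block:
            block.next_free_block.prev_free_block = block.prev_free_block
        else:
            self.tail = block.prev_free_block
        block.prev_free_block = block.next_free_block = None

    def popleft(self) -> Block:
        # The head is the least recently freed block: the LRU candidate.
        block = self.head
        self.remove(block)
        return block

    def append(self, block: Block) -> None:
        # Freed blocks rejoin at the tail (most recently used end).
        if self.tail:
            self.tail.next_free_block = block
            block.prev_free_block = self.tail
        else:
            self.head = block
        self.tail = block

blocks = [Block(i) for i in range(4)]
queue = FreeBlockQueue(blocks)
queue.remove(blocks[2])   # a cache hit "touches" block 2: O(1) unlink
lru = queue.popleft()     # allocation pops the LRU block (block 0)
queue.append(blocks[2])   # block 2 is freed again, goes to the tail
```

The point of embedding the pointers in the block itself is that a "touch" on a cache hit can remove the block from the middle of the queue without any search.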
+
+As a result, we will have the following components when the KV cache manager is initialized:
+
+:::{image} /assets/design/v1/prefix_caching/overview.png
+:alt: Component Overview
+:::
+
+* Block Pool: A list of KVCacheBlock.
+* Free Block Queue: Only stores the head and tail block pointers for manipulation.
+* Cache blocks: Mapping from hash key to block IDs.
+* Request blocks: Mapping from request ID to allocated block IDs.
+
+## Operations
+
+### Block Allocation
+
+**New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation:
+
+1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up the Cache Blocks mapping.
+2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
+    1. Compute the number of new blocks required, and return if there are not enough blocks to allocate.
+    2. “Touch” the computed blocks. It increases the reference count of each computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration.
+    3. Allocate new blocks by popping them from the head of the free queue. If a head block is a cached block, this also “evicts” it, so that no other requests can reuse it from now on.
+    4. If an allocated block is already full of tokens, we immediately add it to the Cache Blocks mapping, so that it can be reused by other requests in the same batch.
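Step 1 of this flow (matching the prompt's full blocks against cached hashes) could be sketched as follows; the names and the plain `hash()` are illustrative, not vLLM's actual API:

```python
BLOCK_SIZE = 4  # tokens per kv-cache block (vLLM's real default differs)

def get_computed_blocks(cached_blocks: dict, token_ids: list) -> list:
    """Walk the prompt's full blocks and stop at the first cache miss."""
    computed, parent = [], None
    num_full = len(token_ids) // BLOCK_SIZE * BLOCK_SIZE  # full blocks only
    for start in range(0, num_full, BLOCK_SIZE):
        h = hash((parent, tuple(token_ids[start:start + BLOCK_SIZE])))
        if h not in cached_blocks:
            break
        computed.append(cached_blocks[h])  # block ID of the cache hit
        parent = h
    return computed

# Two cached blocks covering tokens 0..7; token 8 sits in a partial block.
h0 = hash((None, (0, 1, 2, 3)))
h1 = hash((h0, (4, 5, 6, 7)))
cached_blocks = {h0: 0, h1: 1}
hits = get_computed_blocks(cached_blocks, list(range(9)))  # → [0, 1]
```

Note that matching stops at the first miss: because each hash chains its parent, a miss at block *i* implies every later block also misses.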
+
+**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
+
+1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
+    1. Compute the number of new blocks required, and return if there are not enough blocks to allocate.
+    2. Allocate new blocks by popping them from the head of the free queue. If a head block is a cached block, this also “evicts” it, so that no other requests can reuse it from now on.
+    3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block becomes full, we add it to the Cache Blocks mapping to cache it.
+
+**Duplicated blocks**
+Assuming the block size is 4 and you send a request (Request 1) with prompt ABCDEF and decoding length 3:
+
+```text
+Prompt: [A, B, C, D, E, F]
+Output: [G, H, I]
+
+Time 0:
+ Tokens: [A, B, C, D, E, F, G]
+ Block Table: [0 (ABCD), 1 (EFG)]
+ Cache Blocks: 0
+Time 1:
+ Tokens: [A, B, C, D, E, F, G, H]
+ Block Table: [0 (ABCD), 1 (EFGH)]
+ Cache Blocks: 0, 1
+Time 2:
+ Tokens: [A, B, C, D, E, F, G, H, I]
+ Block Table: [0 (ABCD), 1 (EFGH), 2 (I)]
+ Cache Blocks: 0, 1
+```
+
+Now blocks 0 and 1 are cached. Suppose we send the same request again (Request 2) with greedy sampling, so that it produces exactly the same outputs as Request 1:
+
+```text
+Prompt: [A, B, C, D, E, F]
+Output: [G, H, I]
+
+Time 0:
+ Tokens: [A, B, C, D, E, F, G]
+ Block Table: [0 (ABCD), 3 (EFG)]
+ Cache Blocks: 0, 1
+Time 1:
+ Tokens: [A, B, C, D, E, F, G, H]
+ Block Table: [0 (ABCD), 3 (EFGH)]
+ Cache Blocks: 0, 1, 3
+```
+
+As can be seen, block 3 is a new full block and is cached. However, it is redundant with block 1, meaning that we cached the same content twice. In v0, when block 3 is detected as a duplicate, we free it and let Request 2 use block 1 instead, so its block table becomes `[0, 1]` at Time 1. However, the block table in vLLM v1 is append-only, meaning that changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication is eliminated when the request is freed.
+
+### Free
+
+When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free Request 1 and blocks 2, 3, 4, and 8 associated with it. Note that the freed blocks are added to the tail of the free queue in *reverse* order. This is because the last block of a request always hashes more tokens and is therefore less likely to be reused by other requests, so it should be evicted first.
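This free path can be sketched as follows (a minimal illustration with hypothetical names). Appending in reverse order places the request's last block closest to the head of the queue, so it is evicted first:

```python
def free_request(free_queue, request_blocks, ref_count):
    # Walk the request's blocks back-to-front; blocks that nobody else uses
    # go back to the tail of the LRU free queue.
    for block_id in reversed(request_blocks):
        ref_count[block_id] -= 1
        if ref_count[block_id] == 0:
            free_queue.append(block_id)

free_queue = [5, 6]                       # blocks that were already free
ref_count = {2: 1, 3: 1, 4: 1, 8: 1}
free_request(free_queue, [2, 3, 4, 8], ref_count)
```

After freeing, block 8 (the request's deepest block) sits nearest the eviction head among the newly freed blocks, while block 2 is the last to be evicted.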
+
+:::{image} /assets/design/v1/prefix_caching/free.png
+:alt: Free Queue after Free a Request
+:::
+
+### Eviction (LRU)
+
+When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps:
+
+1. Pop the block from the head of the free queue. This is the LRU block to be evicted.
+2. Remove the block ID from the Cache Blocks.
+3. Remove the block hash.
+
+## Example
+
+In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
+
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-1.png
+:alt: Example Time 1
+:::
+
+**Time 3: Request 0 makes block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-3.png
+:alt: Example Time 3
+:::
+
+**Time 4: Request 1 comes in with 14 prompt tokens, where the first 11 tokens are the same as Request 0's.** Only 2 blocks (8 tokens) hit the cache, because the 3rd block matches only 3 of its 4 tokens.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-4.png
+:alt: Example Time 4
+:::
+
+**Time 5: Request 0 is finished and freed.** Blocks 2, 3 and 4 are added to the free queue in reverse order (but blocks 2 and 3 are still cached). Blocks 0 and 1 are not added to the free queue because they are still being used by Request 1.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-5.png
+:alt: Example Time 5
+:::
+
+**Time 6: Request 1 is finished and freed.**
+
+:::{image} /assets/design/v1/prefix_caching/example-time-6.png
+:alt: Example Time 6
+:::
+
+**Time 7: Request 2 comes in with 33 prompt tokens, where the first 16 tokens are the same as Request 0's.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache-hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, and 3 (evicted).
+
+:::{image} /assets/design/v1/prefix_caching/example-time-7.png
+:alt: Example Time 7
+:::
diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/source/features/automatic_prefix_caching.md
index 3d70cbb29c385..59016d7fcf6b3 100644
--- a/docs/source/features/automatic_prefix_caching.md
+++ b/docs/source/features/automatic_prefix_caching.md
@@ -6,9 +6,9 @@
Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
-```{note}
+:::{note}
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
-```
+:::
## Enabling APC in vLLM
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index 47ab616b30686..b0018ebccf5ba 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -4,13 +4,13 @@
The tables below show mutually exclusive features and the support on some hardware.
-```{note}
+:::{note}
Check the '✗' with links to see tracking issue for unsupported feature/hardware combination.
-```
+:::
## Feature x Feature
-```{raw} html
+:::{raw} html
-```
+:::
-```{list-table}
- :header-rows: 1
- :stub-columns: 1
- :widths: auto
+:::{list-table}
+:header-rows: 1
+:stub-columns: 1
+:widths: auto
- * - Feature
- - [CP](#chunked-prefill)
- - [APC](#automatic-prefix-caching)
- - [LoRA](#lora-adapter)
- - prmpt adptr
- - [SD](#spec_decode)
- - CUDA graph
- - pooling
- - enc-dec
- - logP
- - prmpt logP
- - async output
- - multi-step
- - mm
- - best-of
- - beam-search
- - guided dec
- * - [CP](#chunked-prefill)
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - [APC](#automatic-prefix-caching)
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - [LoRA](#lora-adapter)
- - [✗](gh-pr:9057)
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - prmpt adptr
- - ✅
- - ✅
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - [SD](#spec_decode)
- - ✅
- - ✅
- - ✗
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - CUDA graph
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - pooling
- - ✗
- - ✗
- - ✗
- - ✗
- - ✗
- - ✗
- -
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - enc-dec
- - ✗
- - [✗](gh-issue:7366)
- - ✗
- - ✗
- - [✗](gh-issue:7366)
- - ✅
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- -
- * - logP
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✗
- - ✅
- -
- -
- -
- -
- -
- -
- -
- -
- * - prmpt logP
- - ✅
- - ✅
- - ✅
- - ✅
- - [✗](gh-pr:8199)
- - ✅
- - ✗
- - ✅
- - ✅
- -
- -
- -
- -
- -
- -
- -
- * - async output
- - ✅
- - ✅
- - ✅
- - ✅
- - ✗
- - ✅
- - ✗
- - ✗
- - ✅
- - ✅
- -
- -
- -
- -
- -
- -
- * - multi-step
- - ✗
- - ✅
- - ✗
- - ✅
- - ✗
- - ✅
- - ✗
- - ✗
- - ✅
- - [✗](gh-issue:8198)
- - ✅
- -
- -
- -
- -
- -
- * - mm
- - ✅
- - [✗](gh-pr:8348)
- - [✗](gh-pr:7199)
- - ?
- - ?
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ?
- -
- -
- -
- -
- * - best-of
- - ✅
- - ✅
- - ✅
- - ✅
- - [✗](gh-issue:6137)
- - ✅
- - ✗
- - ✅
- - ✅
- - ✅
- - ?
- - [✗](gh-issue:7968)
- - ✅
- -
- -
- -
- * - beam-search
- - ✅
- - ✅
- - ✅
- - ✅
- - [✗](gh-issue:6137)
- - ✅
- - ✗
- - ✅
- - ✅
- - ✅
- - ?
- - [✗](gh-issue:7968>)
- - ?
- - ✅
- -
- -
- * - guided dec
- - ✅
- - ✅
- - ?
- - ?
- - [✗](gh-issue:11484)
- - ✅
- - ✗
- - ?
- - ✅
- - ✅
- - ✅
- - [✗](gh-issue:9893)
- - ?
- - ✅
- - ✅
- -
-
-```
+- * Feature
+ * [CP](#chunked-prefill)
+ * [APC](#automatic-prefix-caching)
+ * [LoRA](#lora-adapter)
+ * prmpt adptr
+ * [SD](#spec_decode)
+ * CUDA graph
+ * pooling
+ * enc-dec
+ * logP
+ * prmpt logP
+ * async output
+ * multi-step
+ * mm
+ * best-of
+ * beam-search
+ * guided dec
+- * [CP](#chunked-prefill)
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * [APC](#automatic-prefix-caching)
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * [LoRA](#lora-adapter)
+ * [✗](gh-pr:9057)
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * prmpt adptr
+ * ✅
+ * ✅
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * [SD](#spec_decode)
+ * ✅
+ * ✅
+ * ✗
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * CUDA graph
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * pooling
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * enc-dec
+ * ✗
+ * [✗](gh-issue:7366)
+ * ✗
+ * ✗
+ * [✗](gh-issue:7366)
+ * ✅
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * logP
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✗
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * prmpt logP
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * [✗](gh-pr:8199)
+ * ✅
+ * ✗
+ * ✅
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+- * async output
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✗
+ * ✅
+ * ✗
+ * ✗
+ * ✅
+ * ✅
+ *
+ *
+ *
+ *
+ *
+ *
+- * multi-step
+ * ✗
+ * ✅
+ * ✗
+ * ✅
+ * ✗
+ * ✅
+ * ✗
+ * ✗
+ * ✅
+ * [✗](gh-issue:8198)
+ * ✅
+ *
+ *
+ *
+ *
+ *
+- * mm
+ * ✅
+ * [✗](gh-pr:8348)
+ * [✗](gh-pr:7199)
+ * ?
+ * ?
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ?
+ *
+ *
+ *
+ *
+- * best-of
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * [✗](gh-issue:6137)
+ * ✅
+ * ✗
+ * ✅
+ * ✅
+ * ✅
+ * ?
+ * [✗](gh-issue:7968)
+ * ✅
+ *
+ *
+ *
+- * beam-search
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * [✗](gh-issue:6137)
+ * ✅
+ * ✗
+ * ✅
+ * ✅
+ * ✅
+ * ?
+ * [✗](gh-issue:7968)
+ * ?
+ * ✅
+ *
+ *
+- * guided dec
+ * ✅
+ * ✅
+ * ?
+ * ?
+ * [✗](gh-issue:11484)
+ * ✅
+ * ✗
+ * ?
+ * ✅
+ * ✅
+ * ✅
+ * [✗](gh-issue:9893)
+ * ?
+ * ✅
+ * ✅
+ *
+:::
(feature-x-hardware)=
## Feature x Hardware
-```{list-table}
- :header-rows: 1
- :stub-columns: 1
- :widths: auto
+:::{list-table}
+:header-rows: 1
+:stub-columns: 1
+:widths: auto
- * - Feature
- - Volta
- - Turing
- - Ampere
- - Ada
- - Hopper
- - CPU
- - AMD
- * - [CP](#chunked-prefill)
- - [✗](gh-issue:2729)
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - [APC](#automatic-prefix-caching)
- - [✗](gh-issue:3687)
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - [LoRA](#lora-adapter)
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - prmpt adptr
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - [✗](gh-issue:8475)
- - ✅
- * - [SD](#spec_decode)
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - CUDA graph
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✗
- - ✅
- * - pooling
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ?
- * - enc-dec
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✗
- * - mm
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - logP
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - prmpt logP
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - async output
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✗
- - ✗
- * - multi-step
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - [✗](gh-issue:8477)
- - ✅
- * - best-of
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - beam-search
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- * - guided dec
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
- - ✅
-```
+- * Feature
+ * Volta
+ * Turing
+ * Ampere
+ * Ada
+ * Hopper
+ * CPU
+ * AMD
+- * [CP](#chunked-prefill)
+ * [✗](gh-issue:2729)
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * [APC](#automatic-prefix-caching)
+ * [✗](gh-issue:3687)
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * [LoRA](#lora-adapter)
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * prmpt adptr
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * [✗](gh-issue:8475)
+ * ✅
+- * [SD](#spec_decode)
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * CUDA graph
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✗
+ * ✅
+- * pooling
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ?
+- * enc-dec
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✗
+- * mm
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * logP
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * prmpt logP
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * async output
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✗
+ * ✗
+- * multi-step
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * [✗](gh-issue:8477)
+ * ✅
+- * best-of
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * beam-search
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+- * guided dec
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+ * ✅
+:::
diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md
index efa2efc66192e..52d253b9c2b18 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -4,9 +4,9 @@
This page introduces you to the disaggregated prefilling feature in vLLM.
-```{note}
+:::{note}
This feature is experimental and subject to change.
-```
+:::
## Why disaggregated prefilling?
@@ -15,9 +15,9 @@ Two main reasons:
- **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling puts the prefill and decode phases of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request, resulting in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size can also achieve the same goal, but in practice it is hard to figure out the correct chunk size value, so disaggregated prefilling is a much more reliable way to control tail ITL.
-```{note}
+:::{note}
Disaggregated prefill DOES NOT improve throughput.
-```
+:::
## Usage example
@@ -39,21 +39,21 @@ Key abstractions for disaggregated prefilling:
- **LookupBuffer**: LookupBuffer provides two APIs: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL: `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drops it from the buffer.
- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.
-```{note}
+:::{note}
`insert` is a non-blocking operation, while `drop_select` is a blocking operation.
-```
+:::
Here is a figure illustrating how the above 3 abstractions are organized:
-```{image} /assets/features/disagg_prefill/abstraction.jpg
+:::{image} /assets/features/disagg_prefill/abstraction.jpg
:alt: Disaggregated prefilling abstractions
-```
+:::
The workflow of disaggregated prefilling is as follows:
-```{image} /assets/features/disagg_prefill/overview.jpg
+:::{image} /assets/features/disagg_prefill/overview.jpg
:alt: Disaggregated prefilling workflow
-```
+:::
The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer.
diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md
index b00d05147bb32..fb5a7a0d519cb 100644
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -60,9 +60,9 @@ vllm serve meta-llama/Llama-2-7b-hf \
--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
```
-```{note}
+:::{note}
The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
-```
+:::
The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index 404505eb3890e..30735b1161ff3 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -2,11 +2,11 @@
# AutoAWQ
-```{warning}
+:::{warning}
Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low-latency
inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version.
-```
+:::
To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index 1398e8a324201..a62e0124b7706 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -14,10 +14,10 @@ The FP8 types typically supported in hardware have two distinct representations,
- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`.
- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values.
-```{note}
+:::{note}
FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper).
FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
-```
+:::
## Quick Start with Online Dynamic Quantization
@@ -32,9 +32,9 @@ model = LLM("facebook/opt-125m", quantization="fp8")
result = model.generate("Hello, my name is")
```
-```{warning}
+:::{warning}
Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
-```
+:::
## Installation
@@ -110,9 +110,9 @@ model.generate("Hello my name is")
Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
-```{note}
+:::{note}
Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
-```
+:::
```console
$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
@@ -137,10 +137,10 @@ If you encounter any issues or have feature requests, please open an issue on th
## Deprecated Flow
-```{note}
+:::{note}
The following information is preserved for reference and search purposes.
The quantization method described below is deprecated in favor of the `llmcompressor` method described above.
-```
+:::
For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8).
diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md
index 640997cf4bc39..65c181900f9be 100644
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/source/features/quantization/gguf.md
@@ -2,13 +2,13 @@
# GGUF
-```{warning}
+:::{warning}
Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment; it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
-```
+:::
-```{warning}
+:::{warning}
+Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge the files into a single-file model.
-```
+:::
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
@@ -25,9 +25,9 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
```
-```{warning}
+:::{warning}
We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocab size.
-```
+:::
You can also use the GGUF model directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
index 56ccdb5f00c34..1c98620aa2145 100644
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -4,7 +4,7 @@
Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
-```{toctree}
+:::{toctree}
:caption: Contents
:maxdepth: 1
@@ -12,7 +12,8 @@ supported_hardware
auto_awq
bnb
gguf
+int4
int8
fp8
quantized_kvcache
-```
+:::
diff --git a/docs/source/features/quantization/int4.md b/docs/source/features/quantization/int4.md
new file mode 100644
index 0000000000000..f8939e5bf0150
--- /dev/null
+++ b/docs/source/features/quantization/int4.md
@@ -0,0 +1,166 @@
+(int4)=
+
+# INT4 W4A16
+
+vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS).
+
+Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int4-llms-for-vllm-668ec34bf3c9fa45f857df2c).
+
+:::{note}
+INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell).
+:::
+
+## Prerequisites
+
+To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
+
+```console
+pip install llmcompressor
+```
+
+## Quantization Process
+
+The quantization process involves four main steps:
+
+1. Loading the model
+2. Preparing calibration data
+3. Applying quantization
+4. Evaluating accuracy in vLLM
+
+### 1. Loading the Model
+
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(
+ MODEL_ID, device_map="auto", torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+```
+
+### 2. Preparing Calibration Data
+
+When quantizing weights to INT4, you need sample data to estimate the weight updates and calibrated scales.
+It's best to use calibration data that closely matches your deployment data.
+For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
+
+```python
+from datasets import load_dataset
+
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load and preprocess the dataset
+ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+def preprocess(example):
+ return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+ds = ds.map(preprocess)
+
+def tokenize(sample):
+ return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+```
+
+### 3. Applying Quantization
+
+Now, apply the quantization algorithms:
+
+```python
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+
+# Configure the quantization algorithms
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# Apply quantization
+oneshot(
+ model=model,
+ dataset=ds,
+ recipe=recipe,
+ max_seq_length=MAX_SEQUENCE_LENGTH,
+ num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Save the compressed model
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+This process creates a W4A16 model with weights quantized to 4-bit integers.
+
+### 4. Evaluating Accuracy
+
+After quantization, you can load and run the model in vLLM:
+
+```python
+from vllm import LLM
+model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
+```
+
+To evaluate accuracy, you can use `lm_eval`:
+
+```console
+$ lm_eval --model vllm \
+ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
+ --tasks gsm8k \
+ --num_fewshot 5 \
+ --limit 250 \
+ --batch_size 'auto'
+```
+
+:::{note}
+Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
+:::
+
+## Best Practices
+
+- Start with 512 samples for calibration data, and increase if accuracy drops
+- Ensure the calibration data contains a high variety of samples to prevent overfitting towards a specific use case
+- Use a sequence length of 2048 as a starting point
+- Employ the chat template or instruction template that the model was trained with
+- If you've fine-tuned a model, consider using a sample of your training data for calibration
+- Tune key hyperparameters to the quantization algorithm:
+ - `dampening_frac` sets how much influence the GPTQ algorithm has. Lower values can improve accuracy, but can lead to numerical instabilities that cause the algorithm to fail.
+ - `actorder` sets the activation ordering. When compressing a layer's weights, the order in which its channels are quantized matters. Setting `actorder="weight"` can improve accuracy without added latency.
+
+The following is an example of an expanded quantization recipe you can tune to your own use case:
+
+```python
+from compressed_tensors.quantization import (
+ QuantizationArgs,
+ QuantizationScheme,
+ QuantizationStrategy,
+ QuantizationType,
+)
+recipe = GPTQModifier(
+ targets="Linear",
+ config_groups={
+ "config_group": QuantizationScheme(
+ targets=["Linear"],
+ weights=QuantizationArgs(
+ num_bits=4,
+ type=QuantizationType.INT,
+ strategy=QuantizationStrategy.GROUP,
+ group_size=128,
+ symmetric=True,
+ dynamic=False,
+ actorder="weight",
+ ),
+ ),
+ },
+ ignore=["lm_head"],
+ update_size=NUM_CALIBRATION_SAMPLES,
+ dampening_frac=0.01
+)
+```
+
+## Troubleshooting and Support
+
+If you encounter any issues or have feature requests, please open an issue on the [`vllm-project/llm-compressor`](https://github.com/vllm-project/llm-compressor) GitHub repository. The full INT4 quantization example in `llm-compressor` is available [here](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w4a16/llama3_example.py).
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 592a60d3988b2..b381f34bccd34 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -7,9 +7,9 @@ This quantization method is particularly useful for reducing model size while ma
Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
-```{note}
-INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
-```
+:::{note}
+INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
+:::
## Prerequisites
@@ -119,9 +119,9 @@ $ lm_eval --model vllm \
--batch_size 'auto'
```
-```{note}
+:::{note}
Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
-```
+:::
## Best Practices
@@ -132,4 +132,4 @@ Quantized models can be sensitive to the presence of the `bos` token. Make sure
## Troubleshooting and Support
-If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository.
+If you encounter any issues or have feature requests, please open an issue on the [`vllm-project/llm-compressor`](https://github.com/vllm-project/llm-compressor) GitHub repository.
diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
index f5c0a95ea426e..555ed4ce4c8db 100644
--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -4,128 +4,129 @@
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
-```{list-table}
+:::{list-table}
:header-rows: 1
:widths: 20 8 8 8 8 8 8 8 8 8 8
-* - Implementation
- - Volta
- - Turing
- - Ampere
- - Ada
- - Hopper
- - AMD GPU
- - Intel GPU
- - x86 CPU
- - AWS Inferentia
- - Google TPU
-* - AWQ
- - ✗
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✅︎
- - ✅︎
- - ✗
- - ✗
-* - GPTQ
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✅︎
- - ✅︎
- - ✗
- - ✗
-* - Marlin (GPTQ/AWQ/FP8)
- - ✗
- - ✗
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✗
- - ✗
- - ✗
-* - INT8 (W8A8)
- - ✗
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✅︎
- - ✗
- - ✗
-* - FP8 (W8A8)
- - ✗
- - ✗
- - ✗
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✗
- - ✗
-* - AQLM
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✗
- - ✗
- - ✗
-* - bitsandbytes
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✗
- - ✗
- - ✗
-* - DeepSpeedFP
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✗
- - ✗
- - ✗
-* - GGUF
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✅︎
- - ✗
- - ✗
- - ✗
- - ✗
-```
+- * Implementation
+ * Volta
+ * Turing
+ * Ampere
+ * Ada
+ * Hopper
+ * AMD GPU
+ * Intel GPU
+ * x86 CPU
+ * AWS Inferentia
+ * Google TPU
+- * AWQ
+ * ✗
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+- * GPTQ
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+- * Marlin (GPTQ/AWQ/FP8)
+ * ✗
+ * ✗
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+- * INT8 (W8A8)
+ * ✗
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✅︎
+ * ✗
+ * ✗
+- * FP8 (W8A8)
+ * ✗
+ * ✗
+ * ✗
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+- * AQLM
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+- * bitsandbytes
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+- * DeepSpeedFP
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+- * GGUF
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✅︎
+ * ✗
+ * ✗
+ * ✗
+ * ✗
+
+:::
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- "✅︎" indicates that the quantization method is supported on the specified hardware.
- "✗" indicates that the quantization method is not supported on the specified hardware.
-```{note}
+:::{note}
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
For the most up-to-date information on hardware support and quantization methods, please consult with the vLLM development team.
-```
+:::
diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md
new file mode 100644
index 0000000000000..e39bbacf1138d
--- /dev/null
+++ b/docs/source/features/reasoning_outputs.md
@@ -0,0 +1,151 @@
+(reasoning-outputs)=
+
+# Reasoning Outputs
+
+vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
+
+Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+
+## Supported Models
+
+vLLM currently supports the following reasoning models:
+
+- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) (`deepseek_r1`, which looks for `<think> ... </think>`)
+
+## Quickstart
+
+To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when launching the server. The `--reasoning-parser` flag selects the parser used to extract reasoning content from the model output.
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+ --enable-reasoning --reasoning-parser deepseek_r1
+```
+
+Next, make a request to the model; the response should include the reasoning content.
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+ api_key=openai_api_key,
+ base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+# Round 1
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+response = client.chat.completions.create(model=model, messages=messages)
+
+reasoning_content = response.choices[0].message.reasoning_content
+content = response.choices[0].message.content
+
+print("reasoning_content:", reasoning_content)
+print("content:", content)
+```
+
+The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
+
+## Streaming chat completions
+
+Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
+
+```json
+{
+ "id": "chatcmpl-123",
+ "object": "chat.completion.chunk",
+ "created": 1694268190,
+ "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+ "system_fingerprint": "fp_44709d6fcb",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "role": "assistant",
+        "reasoning_content": "is"
+ },
+ "logprobs": null,
+ "finish_reason": null
+ }
+ ]
+}
+```
+
+Please note that streaming `reasoning_content` is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests instead.
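As a sketch of how such a stream can be consumed, the helper below accumulates `reasoning_content` and `content` deltas from SSE `data:` lines (the field names follow the chunk example above; the function name is an assumption, not part of vLLM):

```python
import json

def collect_stream(sse_lines):
    """Accumulate reasoning_content and content deltas from SSE data lines."""
    reasoning, content = [], []
    for line in sse_lines:
        # Skip non-data lines and the terminating sentinel.
        if not line.startswith("data: ") or line == "data: [DONE]":
            continue
        delta = json.loads(line[len("data: "):])["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            reasoning.append(delta["reasoning_content"])
        if delta.get("content"):
            content.append(delta["content"])
    return "".join(reasoning), "".join(content)
```

With `requests`, the lines can come from `requests.post(url, json=payload, stream=True).iter_lines(decode_unicode=True)`, where `url` and `payload` match the chat completion endpoint and request body shown earlier.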
+
+## How to support a new reasoning model
+
+You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`.
+
+```python
+# import the required packages
+
+from typing import Optional, Sequence, Tuple, Union
+
+from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
+    ReasoningParser, ReasoningParserManager)
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+# define a reasoning parser and register it to vllm
+# the name list in register_module can be used
+# in --reasoning-parser.
+@ReasoningParserManager.register_module(["example"])
+class ExampleParser(ReasoningParser):
+ def __init__(self, tokenizer: AnyTokenizer):
+ super().__init__(tokenizer)
+
+ def extract_reasoning_content_streaming(
+ self,
+ previous_text: str,
+ current_text: str,
+ delta_text: str,
+ previous_token_ids: Sequence[int],
+ current_token_ids: Sequence[int],
+ delta_token_ids: Sequence[int],
+ ) -> Union[DeltaMessage, None]:
+ """
+ Instance method that should be implemented for extracting reasoning
+ from an incomplete response; for use when handling reasoning calls and
+ streaming. Has to be an instance method because it requires state -
+ the current tokens/diffs, but also the information about what has
+ previously been parsed and extracted (see constructor)
+ """
+
+ def extract_reasoning_content(
+ self, model_output: str, request: ChatCompletionRequest
+ ) -> Tuple[Optional[str], Optional[str]]:
+ """
+ Extract reasoning content from a complete model-generated string.
+
+ Used for non-streaming responses where we have the entire model response
+ available before sending to the client.
+
+ Parameters:
+ model_output: str
+ The model-generated string to extract reasoning content from.
+
+ request: ChatCompletionRequest
+ The request object that was used to generate the model_output.
+
+ Returns:
+ Tuple[Optional[str], Optional[str]]
+ A tuple containing the reasoning content and the content.
+ """
+```
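For illustration, the non-streaming extraction step can be sketched as a plain split on the model's reasoning delimiters (here assuming `<think>`/`</think>` tags as used by DeepSeek R1; the helper name and defaults are hypothetical):

```python
from typing import Optional, Tuple

def split_reasoning(model_output: str,
                    start: str = "<think>",
                    end: str = "</think>") -> Tuple[Optional[str], Optional[str]]:
    """Split a complete model output into (reasoning_content, content)."""
    if start in model_output and end in model_output:
        before, _, rest = model_output.partition(start)
        reasoning, _, after = rest.partition(end)
        # Any text outside the delimiters is treated as final content.
        content = (before + after).strip() or None
        return reasoning.strip() or None, content
    # No reasoning delimiters found: everything is final content.
    return None, model_output
```

A real `extract_reasoning_content` implementation would also need to handle token-level edge cases, but the string-level logic is essentially this split.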
+
+After defining the reasoning parser, you can use it by specifying the `--reasoning-parser` flag when launching the server.
+
+```bash
+vllm serve \
+ --enable-reasoning --reasoning-parser example
+```
+
+## Limitations
+
+- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
+- It is not compatible with the [`structured_outputs`](#structured_outputs) and [`tool_calling`](#tool_calling) features.
+- The reasoning content is not available for all models. Check the model's documentation to see if it supports reasoning.
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index ab7b2f302bd13..da87127057dc5 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -2,15 +2,15 @@
# Speculative Decoding
-```{warning}
+:::{warning}
Please note that speculative decoding in vLLM is not yet optimized and does
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
The work to optimize it is ongoing and can be followed here:
-```
+:::
-```{warning}
+:::{warning}
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
-```
+:::
This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM.
Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index 1d77c7339a33f..90c880e8cfa46 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -95,10 +95,10 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```
-```{tip}
+:::{tip}
While not strictly necessary, it's usually better to indicate in the prompt that JSON output is expected, and to describe which fields the LLM should fill and how.
This can notably improve the results in most cases.
-```
+:::
Finally, we have `guided_grammar`, which is probably the most difficult one to use but is really powerful, as it allows us to define complete languages, such as SQL queries.
It works by using a context-free EBNF grammar, which, for example, we can use to define a specific format of simplified SQL queries, like in the example below:
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index aaa13d0fb6d3f..9d4de18a3b79d 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
import itertools
import re
from dataclasses import dataclass, field
@@ -57,9 +59,9 @@ class Index:
def generate(self) -> str:
content = f"# {self.title}\n\n{self.description}\n\n"
- content += "```{toctree}\n"
+ content += ":::{toctree}\n"
content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
- content += "\n".join(self.documents) + "\n```\n"
+ content += "\n".join(self.documents) + "\n:::\n"
return content
diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index ae42dd0c0d08f..f3b0d6dc9bdc8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -2,6 +2,10 @@
This tab provides instructions on running vLLM with Intel Gaudi devices.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Ubuntu 22.04 LTS
@@ -86,9 +90,9 @@ docker build -f Dockerfile.hpu -t vllm-hpu-env .
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
```
-```{tip}
+:::{tip}
If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-```
+:::
## Extra information
@@ -155,30 +159,30 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
-```{list-table} vLLM execution modes
+:::{list-table} vLLM execution modes
:widths: 25 25 50
:header-rows: 1
-* - `PT_HPU_LAZY_MODE`
- - `enforce_eager`
- - execution mode
-* - 0
- - 0
- - torch.compile
-* - 0
- - 1
- - PyTorch eager mode
-* - 1
- - 0
- - HPU Graphs
-* - 1
- - 1
- - PyTorch lazy mode
-```
+- * `PT_HPU_LAZY_MODE`
+ * `enforce_eager`
+ * execution mode
+- * 0
+ * 0
+ * torch.compile
+- * 0
+ * 1
+ * PyTorch eager mode
+- * 1
+ * 0
+ * HPU Graphs
+- * 1
+ * 1
+ * PyTorch lazy mode
+:::
-```{warning}
+:::{warning}
In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-```
+:::
(gaudi-bucketing-mechanism)=
@@ -187,9 +191,9 @@ In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and
Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently, this is achieved by "bucketing" the model's forward pass across two dimensions - `batch_size` and `sequence_length`.
-```{note}
+:::{note}
Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-```
+:::
Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
@@ -222,15 +226,15 @@ min = 128, step = 128, max = 512
In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
-```{warning}
+:::{warning}
If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenarios.
-```
+:::
As an example, if a request of 3 sequences, with a max sequence length of 412, comes in to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (the closest batch size dimension higher than 3), and the max sequence length will be padded to 512 (the closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a `(4, 512)` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished), in which case it will become a `(2, 512)` bucket, or the context length increases above 512 tokens, in which case it will become a `(4, 640)` bucket.
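The padding rule described above can be sketched as a small rounding helper (a simplified illustration only; vLLM's actual bucketing logic is more involved, e.g. for small batch sizes):

```python
import math

def find_bucket(value: int, bucket_min: int, step: int, bucket_max: int) -> int:
    """Round `value` up to the nearest bucket boundary on a min/step/max grid."""
    padded = max(bucket_min, math.ceil(value / step) * step)
    if padded > bucket_max:
        # Requests beyond the largest bucket are processed without padding.
        raise ValueError(f"{value} exceeds the maximum bucket {bucket_max}")
    return padded
```

For the sequence length range logged above (`min = 128, step = 128, max = 512`), a length of 412 pads to 512.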
-```{note}
+:::{note}
Bucketing is transparent to the client -- padding in the sequence length dimension is never returned to the client, and padding in the batch dimension does not create new requests.
-```
+:::
### Warmup
@@ -252,9 +256,9 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size
This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
-```{tip}
+:::{tip}
Compiling all the buckets might take some time and can be turned off with the `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do so, you may face graph compilations when executing a given bucket for the first time. Disabling warmup is fine for development, but it is highly recommended to enable it in deployment.
-```
+:::
### HPU Graph capture
@@ -269,9 +273,9 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil
The environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of the usable graph memory is reserved for prefill graphs and 70% for decode graphs.
Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
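The interaction of these two variables can be sketched with some illustrative arithmetic (the function name and example numbers are hypothetical, not vLLM internals):

```python
def graph_memory_split(usable_mem_gib: float,
                       graph_reserved_mem: float = 0.1,
                       graph_prompt_ratio: float = 0.3):
    """Split usable memory into KV cache, prefill-graph, and decode-graph pools."""
    graph_pool = usable_mem_gib * graph_reserved_mem      # VLLM_GRAPH_RESERVED_MEM
    prefill_graphs = graph_pool * graph_prompt_ratio      # VLLM_GRAPH_PROMPT_RATIO
    decode_graphs = graph_pool - prefill_graphs
    kv_cache = usable_mem_gib - graph_pool
    return kv_cache, prefill_graphs, decode_graphs
```

For example, with 40 GiB of usable memory and the defaults, 4 GiB is reserved for graphs, split into 1.2 GiB for prefill graphs and 2.8 GiB for decode graphs.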
-```{note}
+:::{note}
`gpu_memory_utilization` does not correspond to the absolute memory usage across the HPU. It specifies the memory margin after loading the model and performing a profile run. If the device has 100 GiB of total memory and 50 GiB of free memory after loading the model weights and executing the profiling run, `gpu_memory_utilization` at its default value will mark 90% of the 50 GiB as usable, leaving a 5 GiB margin, regardless of total device memory.
-```
+:::
Users can also configure the strategy for capturing HPU Graphs separately for the prompt and decode stages. The strategy affects the order in which graphs are captured. There are two strategies implemented:
\- `max_bs` - the graph capture queue is sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1, 256)`); the default strategy for decode
@@ -279,9 +283,9 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec
When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum decode batch size as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size back to its previous state. This means that in a full-load scenario, the decode batch size is often at its maximum, which makes large-batch-size HPU Graphs crucial to capture, as reflected by the `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the `min_tokens` strategy.
-```{note}
+:::{note}
`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on the memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of the usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, and will then attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of this mechanism can be observed in the example below.
-```
+:::
Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
@@ -352,13 +356,13 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
- - `{phase}` is either `PROMPT` or `DECODE`
+ * `{phase}` is either `PROMPT` or `DECODE`
- - `{dim}` is either `BS`, `SEQ` or `BLOCK`
+ * `{dim}` is either `BS`, `SEQ` or `BLOCK`
- - `{param}` is either `MIN`, `STEP` or `MAX`
+ * `{param}` is either `MIN`, `STEP` or `MAX`
- - Default values:
+ * Default values:
- Prompt:
- batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md
index a6c4c44305a4c..01793572fee7c 100644
--- a/docs/source/getting_started/installation/ai_accelerator/index.md
+++ b/docs/source/getting_started/installation/ai_accelerator/index.md
@@ -2,374 +2,375 @@
vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor-specific instructions:
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
+:selected:
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+:::::
+
## Requirements
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "## Requirements"
:end-before: "## Configure a new environment"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+:::::
+
## Configure a new environment
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "## Configure a new environment"
:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} ../python_env_setup.inc.md
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} ../python_env_setup.inc.md
+:::
+
+::::
+
+:::::
+
## Set up using Python
### Pre-built wheels
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+:::
+
+::::
+
+:::::
+
### Build wheel from source
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+:::::
+
## Set up using Docker
### Pre-built images
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+:::
+
+::::
+
+:::::
+
### Build image from source
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-```
-
:::
::::
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+:::
+
+::::
+
+:::::
+
## Extra information
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} TPU
+::::{tab-item} Google TPU
:sync: tpu
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
:start-after: "## Extra information"
-```
-
-:::
-
-:::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-```{include} hpu-gaudi.inc.md
-:start-after: "## Extra information"
-```
-
-:::
-
-:::{tab-item} Neuron
-:sync: neuron
-
-```{include} neuron.inc.md
-:start-after: "## Extra information"
-```
-
-:::
-
-:::{tab-item} OpenVINO
-:sync: openvino
-
-```{include} openvino.inc.md
-:start-after: "## Extra information"
-```
-
:::
::::
+
+::::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+:::{include} hpu-gaudi.inc.md
+:start-after: "## Extra information"
+:::
+
+::::
+
+::::{tab-item} AWS Neuron
+:sync: neuron
+
+:::{include} neuron.inc.md
+:start-after: "## Extra information"
+:::
+
+::::
+
+::::{tab-item} OpenVINO
+:sync: openvino
+
+:::{include} openvino.inc.md
+:start-after: "## Extra information"
+:::
+
+::::
+
+:::::
diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
index 575a9f9c2e2f0..f149818acafb8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -4,6 +4,10 @@ vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Infere
Paged Attention and Chunked Prefill are currently in development and will be available soon.
Data types currently supported in Neuron SDK are FP16 and BF16.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
@@ -67,9 +71,9 @@ Currently, there are no pre-built Neuron wheels.
### Build wheel from source
-```{note}
+:::{note}
The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-```
+:::
Following instructions are applicable to Neuron SDK 2.16 and beyond.
diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index a7867472583d6..112e8d4d9b256 100644
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -2,6 +2,10 @@
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index 6a911cc6b9eba..c0d50feafce56 100644
--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -30,6 +30,10 @@ For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tp
You may need additional persistent storage for your TPU VMs. For more
information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options).
+:::{attention}
+There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
+:::
+
## Requirements
- Google Cloud TPU VM
@@ -47,10 +51,10 @@ When you request queued resources, the request is added to a queue maintained by
the Cloud TPU service. When the requested resource becomes available, it's
assigned to your Google Cloud project for your immediate exclusive use.
-```{note}
+:::{note}
In all of the following commands, replace the ALL CAPS parameter names with
appropriate values. See the parameter descriptions table for more information.
-```
+:::
### Provision Cloud TPUs with GKE
@@ -75,33 +79,33 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
--service-account SERVICE_ACCOUNT
```
-```{list-table} Parameter descriptions
+:::{list-table} Parameter descriptions
:header-rows: 1
-* - Parameter name
- - Description
-* - QUEUED_RESOURCE_ID
- - The user-assigned ID of the queued resource request.
-* - TPU_NAME
- - The user-assigned name of the TPU which is created when the queued
+- * Parameter name
+ * Description
+- * QUEUED_RESOURCE_ID
+ * The user-assigned ID of the queued resource request.
+- * TPU_NAME
+ * The user-assigned name of the TPU which is created when the queued
resource request is allocated.
-* - PROJECT_ID
- - Your Google Cloud project
-* - ZONE
- - The GCP zone where you want to create your Cloud TPU. The value you use
+- * PROJECT_ID
+ * Your Google Cloud project
+- * ZONE
+ * The GCP zone where you want to create your Cloud TPU. The value you use
depends on the version of TPUs you are using. For more information, see
`TPU regions and zones `_
-* - ACCELERATOR_TYPE
- - The TPU version you want to use. Specify the TPU version, for example
+- * ACCELERATOR_TYPE
+ * The TPU version you want to use. Specify the TPU version, for example
`v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
see `TPU versions `_.
-* - RUNTIME_VERSION
- - The TPU VM runtime version to use. For more information see `TPU VM images `_.
-* - SERVICE_ACCOUNT
- - The email address for your service account. You can find it in the IAM
+- * RUNTIME_VERSION
+ * The TPU VM runtime version to use. For more information see `TPU VM images `_.
+- * SERVICE_ACCOUNT
+ * The email address for your service account. You can find it in the IAM
Cloud Console under *Service Accounts*. For example:
`tpu-service-account@.iam.gserviceaccount.com`
-```
+:::
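+
+For example, a request for a four-core v5e TPU might look like the following (all values are illustrative; substitute your own):
+
+```console
+gcloud alpha compute tpus queued-resources create my-queued-resource \
+  --node-id my-tpu \
+  --project my-project \
+  --zone us-central1-a \
+  --accelerator-type v5litepod-4 \
+  --runtime-version v2-alpha-tpuv5-lite \
+  --service-account tpu-service-account@my-project.iam.gserviceaccount.com
+```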
Connect to your TPU using SSH:
@@ -178,15 +182,15 @@ Run the Docker image with the following command:
docker run --privileged --net host --shm-size=16G -it vllm-tpu
```
-```{note}
+:::{note}
Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
possible input shapes and compiles an XLA graph for each shape. The
compilation time may take 20~30 minutes in the first run. However, the
compilation time reduces to ~5 minutes afterwards because the XLA graphs are
cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default).
-```
+:::
-````{tip}
+:::{tip}
If you encounter the following error:
```console
@@ -198,9 +202,10 @@ file or directory
Install OpenBLAS with the following command:
```console
-$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
```
-````
+
+:::
## Extra information
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md
index 56545253b1ef7..3bf1d47fa0ff9 100644
--- a/docs/source/getting_started/installation/cpu/apple.inc.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -4,6 +4,10 @@ vLLM has experimental support for macOS with Apple silicon. For now, users shall
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: `macOS Sonoma` or later
@@ -25,9 +29,9 @@ pip install -r requirements-cpu.txt
pip install -e .
```
-```{note}
+:::{note}
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
-```
+:::
#### Troubleshooting
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
index 08a764e1a25f4..a661a0ca5adc7 100644
--- a/docs/source/getting_started/installation/cpu/arm.inc.md
+++ b/docs/source/getting_started/installation/cpu/arm.inc.md
@@ -4,6 +4,10 @@ vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CP
ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md
index 4ec907c0e9fda..d53430403583c 100644
--- a/docs/source/getting_started/installation/cpu/index.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -2,86 +2,87 @@
vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions:
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} x86
+::::{tab-item} Intel/AMD x86
+:selected:
:sync: x86
-```{include} x86.inc.md
+:::{include} x86.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} ARM
-:sync: arm
-
-```{include} arm.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} Apple silicon
-:sync: apple
-
-```{include} apple.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
:::
::::
+::::{tab-item} ARM AArch64
+:sync: arm
+
+:::{include} arm.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+::::{tab-item} Apple silicon
+:sync: apple
+
+:::{include} apple.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+:::::
+
## Requirements
- Python: 3.9 -- 3.12
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} x86
+::::{tab-item} Intel/AMD x86
:sync: x86
-```{include} x86.inc.md
+:::{include} x86.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} ARM
-:sync: arm
-
-```{include} arm.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} Apple silicon
-:sync: apple
-
-```{include} apple.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-```
-
:::
::::
+::::{tab-item} ARM AArch64
+:sync: arm
+
+:::{include} arm.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+::::{tab-item} Apple silicon
+:sync: apple
+
+:::{include} apple.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+:::::
+
## Set up using Python
### Create a new Python environment
-```{include} ../python_env_setup.inc.md
-```
+:::{include} ../python_env_setup.inc.md
+:::
### Pre-built wheels
@@ -89,41 +90,41 @@ Currently, there are no pre-built CPU wheels.
### Build wheel from source
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} x86
+::::{tab-item} Intel/AMD x86
:sync: x86
-```{include} x86.inc.md
+:::{include} x86.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} ARM
-:sync: arm
-
-```{include} arm.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} Apple silicon
-:sync: apple
-
-```{include} apple.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
:::
::::
+::::{tab-item} ARM AArch64
+:sync: arm
+
+:::{include} arm.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+::::{tab-item} Apple silicon
+:sync: apple
+
+:::{include} apple.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+:::::
+
## Set up using Docker
### Pre-built images
@@ -142,9 +143,9 @@ $ docker run -it \
vllm-cpu-env
```
-:::{tip}
+::::{tip}
For ARM or Apple silicon, use `Dockerfile.arm`
-:::
+::::
## Supported features
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
index e4f99d3cebdf2..1dafc3660060e 100644
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -2,12 +2,20 @@
vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- OS: Linux
- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
+:::{tip}
+[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
+:::
+
## Set up using Python
### Pre-built wheels
@@ -17,10 +25,10 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
:::{include} build.inc.md
:::
-```{note}
-- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+:::{note}
+- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
-```
+:::
## Set up using Docker
@@ -29,7 +37,3 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
### Build image from source
## Extra information
-
-## Intel Extension for PyTorch
-
-- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
index 4cce65278c069..5c2ea30dbfde1 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -10,9 +10,9 @@ vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
### Create a new Python environment
-```{note}
+:::{note}
PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details.
-```
+:::
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
@@ -100,10 +100,10 @@ pip install --editable .
You can find more information about vLLM's wheels in .
-```{note}
+:::{note}
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel.
-```
+:::
#### Full build (with compilation)
@@ -115,7 +115,7 @@ cd vllm
pip install -e .
```
-```{tip}
+:::{tip}
Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
@@ -123,7 +123,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
-```
+:::
##### Use an existing PyTorch installation
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md
index 6c007382b2c3d..f82c4bda28620 100644
--- a/docs/source/getting_started/installation/gpu/index.md
+++ b/docs/source/getting_started/installation/gpu/index.md
@@ -2,299 +2,300 @@
vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
+:selected:
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-```
-
:::
::::
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+:::
+
+::::
+
+:::::
+
## Requirements
- OS: Linux
- Python: 3.9 -- 3.12
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-```
-
:::
::::
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+:::
+
+::::
+
+:::::
+
## Set up using Python
### Create a new Python environment
-```{include} ../python_env_setup.inc.md
-```
-
-::::{tab-set}
-:sync-group: device
-
-:::{tab-item} CUDA
-:sync: cuda
-
-```{include} cuda.inc.md
-:start-after: "## Create a new Python environment"
-:end-before: "### Pre-built wheels"
-```
-
+:::{include} ../python_env_setup.inc.md
:::
-:::{tab-item} ROCm
+:::::{tab-set}
+:sync-group: device
+
+::::{tab-item} NVIDIA CUDA
+:sync: cuda
+
+:::{include} cuda.inc.md
+:start-after: "## Create a new Python environment"
+:end-before: "### Pre-built wheels"
+:::
+
+::::
+
+::::{tab-item} AMD ROCm
:sync: rocm
There is no extra information on creating a new Python environment for this device.
-:::
+::::
-:::{tab-item} XPU
+::::{tab-item} Intel XPU
:sync: xpu
There is no extra information on creating a new Python environment for this device.
-:::
-
::::
+:::::
+
### Pre-built wheels
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-```
-
:::
::::
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+:::
+
+::::
+
+:::::
+
(build-from-source)=
### Build wheel from source
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-```
-
:::
::::
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+:::
+
+::::
+
+:::::
+
## Set up using Docker
### Pre-built images
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-```
-
:::
::::
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+:::
+
+::::
+
+:::::
+
### Build image from source
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "### Build image from source"
:end-before: "## Supported features"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-```
-
:::
::::
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+:::
+
+::::
+
+:::::
+
## Supported features
-::::{tab-set}
+:::::{tab-set}
:sync-group: device
-:::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
:sync: cuda
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
:start-after: "## Supported features"
-```
-
-:::
-
-:::{tab-item} ROCm
-:sync: rocm
-
-```{include} rocm.inc.md
-:start-after: "## Supported features"
-```
-
-:::
-
-:::{tab-item} XPU
-:sync: xpu
-
-```{include} xpu.inc.md
-:start-after: "## Supported features"
-```
-
:::
::::
+
+::::{tab-item} AMD ROCm
+:sync: rocm
+
+:::{include} rocm.inc.md
+:start-after: "## Supported features"
+:::
+
+::::
+
+::::{tab-item} Intel XPU
+:sync: xpu
+
+:::{include} xpu.inc.md
+:start-after: "## Supported features"
+:::
+
+::::
+
+:::::
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index 69238f6e36fb2..c8fd11415cfda 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -2,6 +2,10 @@
vLLM supports AMD GPUs with ROCm 6.2.
+:::{attention}
+There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
+:::
+
## Requirements
- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
@@ -13,14 +17,6 @@ vLLM supports AMD GPUs with ROCm 6.2.
Currently, there are no pre-built ROCm wheels.
-However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
-docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
-
-```{tip}
-Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
-for instructions on how to use this prebuilt docker image.
-```
-
### Build wheel from source
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
@@ -47,9 +43,9 @@ for instructions on how to use this prebuilt docker image.
cd ../..
```
- ```{note}
- - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
- ```
+ :::{note}
+ If you see an HTTP issue related to downloading packages while building Triton, please try again, as the HTTP error is intermittent.
+ :::
2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
@@ -67,9 +63,9 @@ for instructions on how to use this prebuilt docker image.
cd ..
```
- ```{note}
- - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
- ```
+ :::{note}
+ You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+ :::
3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps:
@@ -95,23 +91,30 @@ for instructions on how to use this prebuilt docker image.
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
- ```{tip}
+
+ :::{tip}
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
- ```
+ :::
-```{tip}
+:::{tip}
- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-```
+:::
## Set up using Docker
### Pre-built images
-Currently, there are no pre-built ROCm images.
+The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
+docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
+
+:::{tip}
+Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
+for instructions on how to use this prebuilt docker image.
+:::
### Build image from source
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index 577986eba74fd..ef02d9a078a1b 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -2,6 +2,10 @@
vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
## Requirements
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
@@ -30,10 +34,10 @@ pip install -v -r requirements-xpu.txt
VLLM_TARGET_DEVICE=xpu python setup.py install
```
-```{note}
+:::{note}
- FP16 is the default data type in the current XPU backend. The BF16 data
- type will be supported in the future.
-```
+ type is supported on Intel Data Center GPU but is not yet supported on Intel Arc GPU.
+:::
## Set up using Docker
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index bc1d268bf0c7e..c64c3a7208eeb 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -4,10 +4,25 @@
vLLM supports the following hardware platforms:
-```{toctree}
+:::{toctree}
:maxdepth: 1
+:hidden:
gpu/index
cpu/index
ai_accelerator/index
-```
+:::
+
+-
+ - NVIDIA CUDA
+ - AMD ROCm
+ - Intel XPU
+-
+ - Intel/AMD x86
+ - ARM AArch64
+ - Apple silicon
+-
+ - Google TPU
+ - Intel Gaudi
+ - AWS Neuron
+ - OpenVINO
diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md
index 25cfac5f58aa7..cb73914c9c75e 100644
--- a/docs/source/getting_started/installation/python_env_setup.inc.md
+++ b/docs/source/getting_started/installation/python_env_setup.inc.md
@@ -6,9 +6,9 @@ conda create -n myenv python=3.12 -y
conda activate myenv
```
-```{note}
+:::{note}
[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages.
-```
+:::
Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 8ac80e5e5c553..f4682ee45a48e 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -32,9 +32,9 @@ conda activate myenv
pip install vllm
```
-```{note}
+:::{note}
For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
-```
+:::
(quickstart-offline)=
@@ -69,9 +69,9 @@ The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](
llm = LLM(model="facebook/opt-125m")
```
-```{note}
+:::{note}
By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
-```
+:::
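+
+For example, a minimal sketch of loading a model via ModelScope (the model must also be published on ModelScope):
+
+```python
+import os
+# Must be set before the engine is initialized
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+
+from vllm import LLM
+llm = LLM(model="facebook/opt-125m")  # now resolved via ModelScope instead of HuggingFace
+```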
Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
@@ -97,10 +97,10 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru
vllm serve Qwen/Qwen2.5-1.5B-Instruct
```
-```{note}
+:::{note}
By default, the server uses a predefined chat template stored in the tokenizer.
You can learn about overriding it [here](#chat-template).
-```
+:::
This server can be queried in the same format as OpenAI API. For example, to list the models:
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index 7bfe9b4036adf..2f41fa3b6b19e 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -4,9 +4,9 @@
This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
-```{note}
+:::{note}
Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
-```
+:::
## Hangs downloading a model
@@ -18,9 +18,9 @@ It's recommended to download the model first using the [huggingface-cli](https:/
If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
-```{note}
+:::{note}
To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
-```
+:::
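+
+For example (the model name is illustrative):
+
+```console
+vllm serve Qwen/Qwen2.5-1.5B-Instruct --load-format dummy
+```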
## Out of memory
@@ -132,14 +132,14 @@ If the script runs successfully, you should see the message `sanity check is suc
If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
-```{note}
+:::{note}
A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
-```
+:::
(troubleshooting-python-multiprocessing)=
diff --git a/docs/source/index.md b/docs/source/index.md
index d7a1117df9c27..ee25678e2c418 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,13 +1,13 @@
# Welcome to vLLM
-```{figure} ./assets/logos/vllm-logo-text-light.png
+:::{figure} ./assets/logos/vllm-logo-text-light.png
:align: center
:alt: vLLM
:class: no-scaled-link
:width: 60%
-```
+:::
-```{raw} html
+:::{raw} html